Latent profile Analysis (LPA) is a type of latent variable model that can be use to identify latent classes or mixtures in a dataset, based on a set of continuous input variables (Gibson, 1959; Oberski, 2016).
pacman::p_load(dplyr, summarytools, car, mclust, tidyLPA, reshape2, tidyverse, plotly, semPlot, semTools, lavaan, texreg, lme4)
load("input/data/data.RData")
Estimación utilizando la media de las cinco olas por cada indicador
conf <- data %>%
dplyr::select(c02_rec_w01, c02_rec_w02, c02_rec_w03, c02_rec_w04, c02_rec_w05,
c03_rec_w01, c03_rec_w02, c03_rec_w03, c03_rec_w04, c03_rec_w05) %>%
rowwise() %>%
mutate(conf1 = mean(c(c02_rec_w01, c02_rec_w02, c02_rec_w03, c02_rec_w04, c02_rec_w05), na.rm=T),
conf2 = mean(c(c03_rec_w01, c03_rec_w02, c03_rec_w03, c03_rec_w04, c03_rec_w05), na.rm=T))
conf <- conf %>% dplyr::select(conf1, conf2)
conf %>% dplyr::select(conf1, conf2) %>% sjlabelled::as_label(.) %>% dfSummary(, graph.col = FALSE)
## Data Frame Summary
## conf
## Dimensions: 1491 x 2
## Duplicates: 1371
##
## ----------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Valid Missing
## ---- ----------- ----------------------- -------------------- ---------- ---------
## 1 conf1 Mean (sd) : 1.2 (0.4) 14 distinct values 1491 0
## [numeric] min < med < max: (100.0%) (0.0%)
## 1 < 1 < 3
## IQR (CV) : 0.4 (0.3)
##
## 2 conf2 Mean (sd) : 1.4 (0.5) 17 distinct values 1491 0
## [numeric] min < med < max: (100.0%) (0.0%)
## 1 < 1.4 < 3
## IQR (CV) : 0.8 (0.3)
## ----------------------------------------------------------------------------------
BIC <- mclustBIC(conf)
plot(BIC)
summary(BIC)
## Best BIC values:
## EEV,9 EEV,8 EEV,6
## BIC -1028.136 -1034.279667 -1570.4444
## BIC diff 0.000 -6.143846 -542.3086
mod1 <- Mclust(conf, modelNames = "EEV", G = 3, x = BIC)
summary(mod1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -1078.809 1491 13 -2252.612 -2496.649
##
## Clustering table:
## 1 2 3
## 121 101 1269
ICL <- mclustICL(conf)
plot(ICL)
summary(ICL)
## Best ICL values:
## EEV,8 EEV,9 EEV,5
## ICL -2002.922 -2157.6490 -2323.6377
## ICL diff 0.000 -154.7275 -320.7161
means <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
rownames_to_column() %>%
rename(Interest = rowname) %>%
melt(id.vars = "Interest", variable.name = "Profile", value.name = "Mean") %>%
mutate(Mean = round(Mean, 2))
p <- means %>%
ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
geom_point(size = 2.25) +
geom_line(size = 1.25) +
labs(x = NULL, y = "Standardized mean") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")
ggplotly(p, tooltip = c("Interest", "Mean")) %>%
layout(legend = list(orientation = "h", y = 1.2))
conf <- data %>%
dplyr::select(c02_rec_w01, c02_rec_w02, c02_rec_w03, c02_rec_w04, c02_rec_w05,
c03_rec_w01, c03_rec_w02, c03_rec_w03, c03_rec_w04, c03_rec_w05) %>%
rowwise() %>%
mutate(conf1 = mean(c(c02_rec_w01, c02_rec_w02, c02_rec_w03, c02_rec_w04, c02_rec_w05), na.rm=T),
conf2 = mean(c(c03_rec_w01, c03_rec_w02, c03_rec_w03, c03_rec_w04, c03_rec_w05), na.rm=T))
conf <- conf %>% dplyr::select(conf1, conf2)
conf %>% dplyr::select(conf1, conf2) %>% sjlabelled::as_label(.) %>% dfSummary(, graph.col = FALSE)
## Data Frame Summary
## conf
## Dimensions: 1491 x 2
## Duplicates: 1371
##
## ----------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Valid Missing
## ---- ----------- ----------------------- -------------------- ---------- ---------
## 1 conf1 Mean (sd) : 1.2 (0.4) 14 distinct values 1491 0
## [numeric] min < med < max: (100.0%) (0.0%)
## 1 < 1 < 3
## IQR (CV) : 0.4 (0.3)
##
## 2 conf2 Mean (sd) : 1.4 (0.5) 17 distinct values 1491 0
## [numeric] min < med < max: (100.0%) (0.0%)
## 1 < 1.4 < 3
## IQR (CV) : 0.8 (0.3)
## ----------------------------------------------------------------------------------
diversidad <- data %>%
dplyr::select(c06_04_w01, c06_04_w03,
c06_05_w01, c06_05_w03,
c06_06_w01, c06_06_w03) %>%
rowwise() %>%
mutate(div1 = mean(c(c06_04_w01, c06_04_w03), na.rm=T),
div2 = mean(c(c06_05_w01, c06_05_w03), na.rm=T),
div3 = mean(c(c06_06_w01, c06_06_w03), na.rm=T))
diversidad <- diversidad %>% dplyr::select(div1, div2, div3) %>% na.omit()
diversidad %>% select(div1, div2, div3) %>% sjlabelled::as_label(.) %>% summarytools::dfSummary(, graph.col = FALSE)
## Data Frame Summary
## diversidad
## Dimensions: 1476 x 3
## Duplicates: 1139
##
## ----------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Valid Missing
## ---- ----------- ----------------------- -------------------- ---------- ---------
## 1 div1 Mean (sd) : 2.7 (1.1) 1.00 : 182 (12.3%) 1476 0
## [numeric] min < med < max: 1.50 : 113 ( 7.7%) (100.0%) (0.0%)
## 1 < 3 < 5 2.00 : 231 (15.7%)
## IQR (CV) : 1.5 (0.4) 2.50 : 204 (13.8%)
## 3.00 : 254 (17.2%)
## 3.50 : 200 (13.6%)
## 4.00 : 198 (13.4%)
## 4.50 : 60 ( 4.1%)
## 5.00 : 34 ( 2.3%)
##
## 2 div2 Mean (sd) : 3 (1) 1.00 : 102 ( 6.9%) 1476 0
## [numeric] min < med < max: 1.50 : 71 ( 4.8%) (100.0%) (0.0%)
## 1 < 3 < 5 2.00 : 189 (12.8%)
## IQR (CV) : 1.5 (0.3) 2.50 : 194 (13.1%)
## 3.00 : 309 (20.9%)
## 3.50 : 239 (16.2%)
## 4.00 : 227 (15.4%)
## 4.50 : 95 ( 6.4%)
## 5.00 : 50 ( 3.4%)
##
## 3 div3 Mean (sd) : 2.6 (1) 1.00 : 190 (12.9%) 1476 0
## [numeric] min < med < max: 1.50 : 136 ( 9.2%) (100.0%) (0.0%)
## 1 < 2.5 < 5 2.00 : 270 (18.3%)
## IQR (CV) : 1 (0.4) 2.50 : 211 (14.3%)
## 3.00 : 303 (20.5%)
## 3.50 : 166 (11.2%)
## 4.00 : 147 (10.0%)
## 4.50 : 41 ( 2.8%)
## 5.00 : 12 ( 0.8%)
## ----------------------------------------------------------------------------------
BIC <- mclustBIC(diversidad)
plot(BIC)
summary(BIC)
## Best BIC values:
## EEV,9 EEV,8 VVE,2
## BIC -9730.906 -10515.874 -10745.542
## BIC diff 0.000 -784.968 -1014.636
mod1 <- Mclust(diversidad, modelNames = "EEV", G = 3, x = BIC)
summary(mod1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -5433.47 1476 23 -11034.77 -11816.62
##
## Clustering table:
## 1 2 3
## 138 686 652
ICL <- mclustICL(diversidad)
plot(ICL)
summary(ICL)
## Best ICL values:
## EEV,9 VVV,2 EEE,1
## ICL -10121.5 -11234.02 -11250.735
## ICL diff 0.0 -1112.52 -1129.238
means <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
rownames_to_column() %>%
rename(Interest = rowname) %>%
melt(id.vars = "Interest", variable.name = "Profile", value.name = "Mean") %>%
mutate(Mean = round(Mean, 2))
p <- means %>%
ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
geom_point(size = 2.25) +
geom_line(size = 1.25) +
labs(x = NULL, y = "Standardized mean") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")
ggplotly(p, tooltip = c("Interest", "Mean")) %>%
layout(legend = list(orientation = "h", y = 1.2))
data <- data %>%
rowwise() %>%
mutate(conf_interpersonal_w01 =
mean(c(c02_rec_w01, c03_rec_w01), na.rm = T))
data <- data %>%
rowwise() %>%
mutate(conf_interpersonal_w02 =
mean(c(c02_rec_w02, c03_rec_w02), na.rm = T))
data <- data %>%
rowwise() %>%
mutate(conf_interpersonal_w03 =
mean(c(c02_rec_w03, c03_rec_w03), na.rm = T))
data <- data %>%
rowwise() %>%
mutate(conf_interpersonal_w04 =
mean(c(c02_rec_w04, c03_rec_w04), na.rm = T))
data <- data %>%
rowwise() %>%
mutate(conf_interpersonal_w05 =
mean(c(c02_rec_w05, c03_rec_w05), na.rm = T))
conf <- data %>% dplyr::select(conf_interpersonal_w01, conf_interpersonal_w02, conf_interpersonal_w03, conf_interpersonal_w04, conf_interpersonal_w05) %>% na.omit()
BIC <- mclustBIC(conf)
plot(BIC)
summary(BIC)
## Best BIC values:
## EEV,7 EEV,6 EEI,5
## BIC -5720.825 -6995.977 -7659.932
## BIC diff 0.000 -1275.151 -1939.107
mod1 <- Mclust(conf, modelNames = "EEV", G = 3, x = BIC)
summary(mod1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -4584.565 1462 52 -9548.082 -10224.57
##
## Clustering table:
## 1 2 3
## 180 430 852
ICL <- mclustICL(conf)
plot(ICL)
summary(ICL)
## Best ICL values:
## EEV,7 EEI,5 EEV,6
## ICL -6461.459 -7744.000 -7926.921
## ICL diff 0.000 -1282.541 -1465.461
means <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
rownames_to_column() %>%
rename(Interest = rowname) %>%
melt(id.vars = "Interest", variable.name = "Profile", value.name = "Mean") %>%
mutate(Mean = round(Mean, 2))
p <- means %>%
ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
geom_point(size = 2.25) +
geom_line(size = 1.25) +
labs(x = NULL, y = "Standardized mean") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")
ggplotly(p, tooltip = c("Interest", "Mean")) %>%
layout(legend = list(orientation = "h", y = 1.2))
cfa_1 <- '
diversidad_w01 =~ c06_04_w01 + c06_05_w01 + c06_06_w01
'
cfa_2 <- '
diversidad_w03 =~ c06_04_w03 + c06_05_w03 + c06_06_w03
'
fit_1 <- cfa(cfa_1,data=data,missing="ML",estimator="MLR")
## Warning in lav_data_full(data = data, group = group, cluster = cluster, : lavaan WARNING: some cases are empty and will be ignored:
## 219 266 392 437 497 499 504 519 587 606 636 665 675 725 803 804 808 840 1464 1473 1481
fit_2 <- cfa(cfa_2,data=data,missing="ML",estimator="MLR")
## Warning in lav_data_full(data = data, group = group, cluster = cluster, : lavaan WARNING: some cases are empty and will be ignored:
## 314 328 418 495 497 524 962 1169 1187 1201 1455 1472
p1 <- predict(fit_1)
p2 <- predict(fit_2)
scores_diversidad = as.data.frame(p1)
scores_diversidad2 = as.data.frame(p2)
# Merge with factor scores
data = as.data.frame(cbind(data, scores_diversidad, scores_diversidad2))
diversidad <- data %>% select(diversidad_w01, diversidad_w03) %>% na.omit()
# Check
diversidad %>% dplyr::select(diversidad_w01, diversidad_w03) %>% sjlabelled::as_label(.) %>% dfSummary(, graph.col = FALSE)
## Data Frame Summary
## diversidad
## Dimensions: 1459 x 2
## Duplicates: 535
##
## -----------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Valid Missing
## ---- ---------------- ------------------------ --------------------- ---------- ---------
## 1 diversidad_w01 Mean (sd) : 0 (0.8) 154 distinct values 1459 0
## [numeric] min < med < max: (100.0%) (0.0%)
## -1.4 < 0.1 < 1.8
## IQR (CV) : 1.2 (615.4)
##
## 2 diversidad_w03 Mean (sd) : 0 (0.8) 139 distinct values 1459 0
## [numeric] min < med < max: (100.0%) (0.0%)
## -1.4 < 0.1 < 1.8
## IQR (CV) : 1.2 (156.5)
## -----------------------------------------------------------------------------------------
BIC <- mclustBIC(diversidad)
plot(BIC)
summary(BIC)
## Best BIC values:
## EVV,8 EVI,8 EVI,9
## BIC -6331.28 -6360.32808 -6361.32995
## BIC diff 0.00 -29.04825 -30.05011
mod1 <- Mclust(diversidad, modelNames = "EVI", G = 3, x = BIC)
summary(mod1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EVI (diagonal, equal volume, varying shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -3322.609 1459 12 -6732.644 -7439.059
##
## Clustering table:
## 1 2 3
## 388 564 507
ICL <- mclustICL(diversidad)
plot(ICL)
summary(ICL)
## Best ICL values:
## EVE,6 EVV,8 EEE,1
## ICL -6692.18 -6709.33934 -6787.78941
## ICL diff 0.00 -17.15973 -95.60981
means <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
rownames_to_column() %>%
rename(Interest = rowname) %>%
melt(id.vars = "Interest", variable.name = "Profile", value.name = "Mean") %>%
mutate(Mean = round(Mean, 2))
p <- means %>%
ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
geom_point(size = 2.25) +
geom_line(size = 1.25) +
labs(x = NULL, y = "Standardized mean") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")
ggplotly(p, tooltip = c("Interest", "Mean")) %>%
layout(legend = list(orientation = "h", y = 1.2))